In this course, we will use the isavuconazole simulations to predict whether the trough concentration at steady state (SS) exceeds 2 mg/L.
library(tidyverse)
library(tidymodels)
library(skimr)# pour summary
library(GGally)# pour graphique summary
library(tableone)# pour stat descriptive
tidymodels_prefer()
library(DALEXtra)# pour explicability
library(themis)# pour downsample
library(DataExplorer)
#library(recipeselectors)
library(naniar)
library(finetune)
library(stacks)
library(vip)
# Plot the individual simulated concentration-time profiles; the low alpha
# shows the density of overlapping simulated subjects.
isavuco %>% ggplot(aes(x = time, y = out)) +
  geom_point(alpha = 0.1) +
  # fix: the x-axis label contained a stray leading apostrophe ("'Time (h)")
  labs(title = "Individual simulated isavuconazole profiles", x = "Time (h)", y = "Isavuconazole concentrations (mg/L)") +
  theme_bw()
Extraction of C0 at 192h
#histogram of c0 at SS
# Histogram of the trough concentration (C0) at steady state (t = 192 h),
# with a reference line at the 2 mg/L efficacy threshold.
isavuco %>%
  filter(time == 192) %>%
  ggplot(aes(x = out)) +
  geom_histogram() +
  # fix: stray leading apostrophe removed from the x-axis label
  labs(title = "Distribution of the simulated C0 at steady state", x = "C0 isavuconazole") +
  theme_bw() +
  geom_vline(xintercept = 2)# +
# scale_x_continuous(trans=scales::pseudo_log_trans(base = 10))
# Numerical summary of the steady-state trough distribution
isavuco %>%
  filter(time == 192) %>%
  summarise(
    mean_c0 = mean(out),
    min_c0  = min(out),
    max_c0  = max(out)
  )
## # A tibble: 1 × 3
## mean_c0 min_c0 max_c0
## <dbl> <dbl> <dbl>
## 1 4.13 0.139 22.2
# Extract C0 at steady state and derive the binary outcome:
# event = 1 when the trough exceeds 2 mg/L, kept both as a numeric
# indicator (for the prevalence computation) and as a factor (for modelling).
creation_event <- isavuco %>%
  filter(time == 192) %>%
  mutate(
    event_numeric = if_else(out > 2, 1, 0),
    event = as.factor(event_numeric)
  ) %>%
  select(id, event, event_numeric)
# Bar chart of the event / no-event class balance
creation_event %>%
  ggplot(aes(x = event)) +
  geom_bar() +
  theme_bw()
# Proportion of simulated subjects with trough > 2 mg/L (the mean of a
# 0/1 indicator is the event prevalence).
creation_event %>% summarise(prop_event = mean(event_numeric, na.rm=TRUE))
## # A tibble: 1 × 1
## prop_event
## <dbl>
## 1 0.751
# Drop the numeric copy now that the prevalence is computed; only the
# factor version is kept for modelling.
creation_event <- creation_event %>% select(-event_numeric)
Extraction of the concentrations used as predictors and creation of a binned column. We will predict from the concentrations at 48 h (trough) to 60 h, and select important features using the Boruta algorithm.
# Keep the predictor sampling window (48-60 h post-dose) and map each
# retained time onto a concentration column name; unmapped times fall into
# the NA branch and are dropped by the filter below.
isavuco_pred <- isavuco %>% filter(between(time,48,60)) %>% mutate(
  ## binned time column; times after 54 h removed for feasibility purposes
  time_bin = case_when(
    time == 48 ~ "conc_0",
    time == 49 ~ "conc_1",
    time == 50 ~ "conc_2",
    time == 51 ~ "conc_3",
    time == 52 ~ "conc_4",
    time == 53 ~ "conc_5",
    time == 54 ~ "conc_6",
    # fix: use a typed missing value -- case_when() requires every branch
    # to return the same type, and a bare logical NA errors on dplyr < 1.1.0
    TRUE ~ NA_character_
    # time == 55 ~"conc_7",
    # time == 56 ~"conc_8",
    # time == 57 ~ "conc_9",
    # time == 58 ~"conc_10",
    # time == 59 ~"conc_11",
    # time == 60 ~"conc_12"
  )) %>%
  filter(!is.na(time_bin))
## calculation of relative difference between theoretical time and observed time
# diff_rel_time = case_when(
# time==0 ~ 0,
# between(time,0.85,1.15) ~ (time - 1)/1,
# between(time,2.8,3.2) ~ (time - 3)/3,
# TRUE~100)) %>% filter(diff_rel_time<50) %>% mutate_if(is.character,factor)
#
## filter patient with exactly 2 rows for the 2 times
# isavuco_pred %>% group_by(id) %>% filter(n() == 2) %>% ungroup()
# Pivot the selected concentrations to one row per subject.
# fix: make the join keys explicit (by = "id") instead of relying on
# dplyr's natural-join message, and spell out TRUE (not the reassignable T).
isavuco_pivot <- isavuco_pred %>%
  pivot_wider(id_cols = id, names_from = time_bin, values_from = out) %>%
  ## join back the per-subject covariates (deduplicated to one row per id)
  left_join(
    isavuco_pred %>% dplyr::select(id, dose_group:dose) %>% distinct(id, .keep_all = TRUE),
    by = "id"
  )
# merge with the binary outcome
isavuco_ml <- isavuco_pivot %>% left_join(creation_event, by = "id")
# Wide table used only for variable selection: all covariates joined back
# onto the pivoted concentrations.
# fix: explicit join keys -- the natural join on "id" duplicates each
# subject once per sampling time, which is then collapsed below.
isavuco_pivot_all <- isavuco_pred %>%
  pivot_wider(id_cols = id, names_from = time_bin, values_from = out) %>%
  ## join the non-pivoted variables
  left_join(isavuco_pred %>% select(-time_bin), by = "id") %>%
  # add the event
  left_join(creation_event %>% select(id, event), by = "id") %>%
  # remove unuseful features
  select(-time, -dose_group, -out) %>%
  # NOTE(review): deduplication is keyed on the conc_0 VALUE, not on id --
  # distinct(id, .keep_all = TRUE) was probably intended; confirm before
  # changing, since ties in conc_0 across subjects would drop rows here.
  distinct(conc_0, .keep_all = TRUE)
# Quick feature screen: permutation-importance random forest on the
# wide selection table.
rf_model <-
  rand_forest(trees = 500, mode = "classification") %>%
  set_engine("ranger", importance = "permutation")

rf_fit <- fit(rf_model, event ~ ., data = isavuco_pivot_all)

# Plot feature importance
vip(rf_fit)
How it works: (1) Create shadow features — the method duplicates the dataset's predictor variables and randomly shuffles the duplicates (the shadow features), so they have no relationship with the outcome. (2) Train a random forest — a classifier is trained on both real and shadow features, and the importance of each feature (real and shadow) is computed from how much it helps the predictions. (3) Compare real vs. shadow features — Boruta checks whether each real feature is more important than the best shadow feature: if it is, the feature is kept; otherwise it is removed.
# library(Boruta)
#
# boruta_model <- Boruta(event ~ ., data = isavuco_pivot_all, doTrace = 2)
# final_features <- getSelectedAttributes(boruta_model, withTentative = TRUE)
#
# print(final_features) # List of selected features
# library(caret)
#
# control <- rfeControl(functions = rfFuncs, method = "cv", number = 5)
#
# rfe_model <- rfe(isavuco_pivot_all %>% select(-id, -event), isavuco_pivot_all$event,
# sizes = c(2,3,4,5, 10), rfeControl = control)
#
# print(rfe_model) # Selected features
Package under development
step_select_boruta
# # create a preprocessing recipe
# rec <-
# recipe(event ~ ., data = isavuco_ml ) %>%
# step_select_boruta(all_predictors(), outcome = "event")
#
# prepped <- prep(rec)
# prepped
#
# preproc_data <- juice(prepped)
# preproc_data
Relief calculates a feature score for each feature which can then be applied to rank and select top scoring features for feature selection. Alternatively, these scores may be applied as feature weights to guide downstream modeling. Relief feature scoring is based on the identification of feature value differences between nearest neighbor instance pairs. If a feature value difference is observed in a neighboring instance pair with the same class (a ‘hit’), the feature score decreases. Alternatively, if a feature value difference is observed in a neighboring instance pair with different class values (a ‘miss’), the feature score increases.
# rec <-
# recipe(event ~ ., data = isavuco_ml) %>%
# step_select_relief(
# all_predictors(),
# outcome = "auc",
# # top_p = 10,
# threshold = 0.8
# )
#
# prepped <- prep(rec)
#
# new_data <- bake(prepped, new_data = NULL)
# new_data
Features can be selected in many different ways. One scheme is to select features that correlate strongest to the classification variable. This has been called maximum-relevance selection. Many heuristic algorithms can be used, such as the sequential forward, backward, or floating selections.
On the other hand, features can be selected to be mutually far away from each other while still having “high” correlation to the classification variable. This scheme, termed as Minimum Redundancy Maximum Relevance (mRMR) selection has been found to be more powerful than the maximum relevance selection.
As a special case, the “correlation” can be replaced by the statistical dependency between variables. Mutual information can be used to quantify the dependency. In this case, it is shown that mRMR is an approximation to maximizing the dependency between the joint distribution of the selected features and the classification variable.
step_select_mrmr
# rec <-
# recipe(event ~ ., data = isavuco_ml) %>%
# step_select_mrmr(
# all_predictors(),
# outcome = "auc",
# # top_p = 5,
# threshold = 0.85
# )
#
# prepped <- prep(rec)
# prepped
#
# preproc_data <- juice(prepped)
#
# preproc_data
Create a custom metric f_meas_with beta=2
# Vectorized F-measure with beta fixed at 2 (recall weighted twice as
# heavily as precision); thin wrapper around yardstick::f_meas_vec().
f_meas_beta2_vec <- function(truth, estimate, na_rm = TRUE, ...) {
  yardstick::f_meas_vec(truth, estimate, beta = 2, na_rm = na_rm, ...)
}
# Data-frame version of the F2 metric, in the shape yardstick expects.
# NOTE(review): yardstick::metric_summarizer() was deprecated in yardstick
# 1.0 in favour of class_metric_summarizer(); this code works on older
# versions but should be migrated -- confirm the installed yardstick version.
f_meas_beta2 <- function(data, truth, estimate, na_rm = TRUE, ...) {
  truth <- rlang::enquo(truth)
  estimate <- rlang::enquo(estimate)
  yardstick::metric_summarizer(
    metric_nm = "f_meas_beta2",
    metric_fn = f_meas_beta2_vec,
    data = data,
    truth = !!truth,
    estimate = !!estimate,
    na_rm = na_rm,
    ...
  )
}
# Register as a class metric so tune/yardstick know to maximize it
f_meas_beta2 <- yardstick::new_class_metric(f_meas_beta2, direction = "maximize")
Calculation of differences between the remaining concentrations
# Derived predictors: absolute and relative change between the two retained
# concentrations, then keep only the modelling columns.
isavuco_ml <- isavuco_ml %>%
  mutate(
    diff_conc  = conc_1 - conc_0,
    ratio_conc = conc_1 / conc_0
  ) %>%
  select(id, conc_0, conc_1, dose_group:event, diff_conc, ratio_conc)
Graphical exploration: boxplots with the DataExplorer package
# Boxplots of each continuous variable split by outcome (log10 y scale)
plot_boxplot(isavuco_ml, by = "event", scale_y = "log10", ggtheme = theme_bw())
# Histograms of the continuous variables (log10 x scale)
plot_histogram(isavuco_ml, scale_x = "log10", ggtheme = theme_bw())
# Categorical variables, split by outcome subgroup
plot_bar(isavuco_ml, by = "event")
# Pairwise plot matrix coloured by outcome (id column excluded)
ggp <- isavuco_ml %>%
  select(-id) %>%
  ggpairs(aes(color = event))
print(ggp, progress = FALSE)
# isavuco_ml %>% ggplot(aes(x = conc_0, fill=event)) + geom_density(alpha=0.5) + theme_bw()
# isavuco_ml %>% ggplot(aes(x = conc_1, fill=event)) + geom_density(alpha=0.5) + theme_bw()
# isavuco_ml %>% ggplot(aes(x = dose, fill=event)) + geom_density(alpha=0.5) + theme_bw()
# isavuco_ml %>% ggplot(aes(x = diff_conc, fill=event)) + geom_density(alpha=0.5) + theme_bw()
# Load necessary libraries
library(ggrepel) # For labeling points in PCA plot

# PCA on the continuous predictors, coloured by outcome.
# Select relevant columns (continuous variables plus event for coloring)
pca_data <- isavuco_ml %>%
  select(conc_0, conc_1, weight, age, dose, diff_conc, ratio_conc, event) %>%
  mutate(event = as.factor(event)) # Ensure event is a factor for coloring

# Define PCA recipe: normalize, then keep enough components to retain
# 90% of the variance.
pca_recipe <- recipe(~., data = pca_data) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_pca(all_numeric_predictors(), threshold = 0.9)

# Estimate and apply the transformation
pca_prep <- prep(pca_recipe)
pca_transformed <- bake(pca_prep, new_data = pca_data)
# fix: the original then ran bind_cols(select(pca_data)), which selects
# zero columns and was a no-op -- `event` is not numeric, so bake()
# already carries it through untouched.

# Plot PCA results
ggplot(pca_transformed, aes(x = PC1, y = PC2, color = event)) +
  geom_point(alpha = 0.8, size = 3) + # Scatter plot of PCA
  scale_color_manual(values = c("red", "blue")) + # Color by event (0=red, 1=blue)
  labs(title = "PCA Analysis of Continuous Variables",
       x = "Principal Component 1",
       y = "Principal Component 2",
       color = "event") +
  theme_minimal() +
  theme(legend.position = "top")
# library(embed)
# # Define umap recipe
# umap_recipe <- recipe(~., data = pca_data) %>%
# step_normalize(all_numeric_predictors()) %>% # Normalize numeric variables
# step_umap(all_numeric_predictors(), num_comp = 2) # 2 component
# -
# # Prepare the recipe
# umap_prep <- prep(umap_recipe)
#
# # Apply umap transformation
# umap_transformed <- bake(umap_prep, new_data = umap_data)
#
# # Combine umap results with event variable
# umap_transformed <- umap_transformed %>%
# bind_cols(select(umap_data, event))
#
# # Plot umap results
# ggplot(umap_transformed, aes(x = PC1, y = PC2, color = event)) +
# geom_point(alpha = 0.8, size = 3) + # Scatter plot of umap
# scale_color_manual(values = c("red", "blue")) + # Color by event (0=red, 1=blue)
# labs(title = "umap Analysis of Continuous Variables",
# x = "Principal Component 1",
# y = "Principal Component 2",
# color = "Event") +
# theme_minimal() +
# theme(legend.position = "top")
On va transformer les conc, la dose et les diff entre les conc
set.seed(1234)
# Stratified 75/25 split so both partitions keep the event prevalence
isavuco_split <- initial_split(isavuco_ml, strata = event, prop = 3/4)
isavuco_ml_train <- training(isavuco_split)
isavuco_ml_test <- testing(isavuco_split)
# Tag each partition, then stack them for the descriptive table
isavuco_dev <- isavuco_ml_train %>% mutate(type = "dev")
isavuco_val <- isavuco_ml_test %>% mutate(type = "val")
# fix: bind_rows() is the idiomatic way to stack rows; full_join() on all
# shared columns happens to give the same result here (the `type` column
# differs between the partitions) but would silently merge any rows that
# were identical across all columns.
isavuco_des <- bind_rows(isavuco_dev, isavuco_val)
# Retrieve the column names if needed:
# dput(names(isavuco_ml_train))
## Vector of categorical variables that need transformation
catVars <- c("dose_group", "event")
## Create a variable list.
vars <- c("conc_0", "conc_1", "dose_group", "weight", "age", "dose",
          "event", "diff_conc")
# Compare the dev and val partitions; continuous variables are reported
# as non-normal (median with range / IQR).
tableOne <- CreateTableOne(vars = vars, strata = "type", factorVars = catVars, data = isavuco_des)
# fix: spell out TRUE/FALSE -- T and F are ordinary variables in R and can
# be reassigned.
tableOne2 <- print(tableOne, nonnormal = c("conc_0", "conc_1", "weight", "age", "dose", "diff_conc"), printToggle = FALSE, minMax = TRUE)
tableOne2b <- print(tableOne, nonnormal = c("conc_0", "conc_1", "weight", "age", "dose", "diff_conc"), printToggle = FALSE, minMax = FALSE)
| dev | val | p | test | |
|---|---|---|---|---|
| n | 672 | 225 | ||
| conc_0 (median [range]) | 6.18 [0.07, 36.96] | 5.78 [0.12, 32.74] | 0.627 | nonnorm |
| conc_1 (median [range]) | 8.59 [0.22, 52.43] | 8.01 [0.82, 49.35] | 0.386 | nonnorm |
| dose_group (%) | 0.527 | |||
| 10mg/kg | 224 (33.3) | 75 (33.3) | ||
| 15mg/kg | 230 (34.2) | 69 (30.7) | ||
| 5mg/kg | 218 (32.4) | 81 (36.0) | ||
| weight (median [range]) | 47.42 [15.39, 77.23] | 48.69 [18.62, 77.23] | 0.217 | nonnorm |
| age (median [range]) | 10.01 [2.79, 16.67] | 10.18 [2.79, 16.67] | 0.626 | nonnorm |
| dose (median [range]) | 466.65 [76.97, 1158.50] | 477.79 [142.19, 946.57] | 0.661 | nonnorm |
| event = 1 (%) | 505 (75.1) | 169 (75.1) | 1.000 | |
| diff_conc (median [range]) | 2.20 [0.02, 18.68] | 1.85 [0.16, 19.51] | 0.177 | nonnorm |
| dev | val | p | test | |
|---|---|---|---|---|
| n | 672 | 225 | ||
| conc_0 (median [IQR]) | 6.18 [3.27, 10.55] | 5.78 [3.50, 9.65] | 0.627 | nonnorm |
| conc_1 (median [IQR]) | 8.59 [4.75, 14.68] | 8.01 [5.09, 13.81] | 0.386 | nonnorm |
| dose_group (%) | 0.527 | |||
| 10mg/kg | 224 (33.3) | 75 (33.3) | ||
| 15mg/kg | 230 (34.2) | 69 (30.7) | ||
| 5mg/kg | 218 (32.4) | 81 (36.0) | ||
| weight (median [IQR]) | 47.42 [41.69, 53.70] | 48.69 [42.34, 53.62] | 0.217 | nonnorm |
| age (median [IQR]) | 10.01 [8.20, 11.88] | 10.18 [8.43, 11.70] | 0.626 | nonnorm |
| dose (median [IQR]) | 466.65 [266.11, 648.97] | 477.79 [268.90, 620.34] | 0.661 | nonnorm |
| event = 1 (%) | 505 (75.1) | 169 (75.1) | 1.000 | |
| diff_conc (median [IQR]) | 2.20 [1.04, 4.55] | 1.85 [1.00, 3.69] | 0.177 | nonnorm |
## Resamples used for hyperparameter tuning
set.seed(2345)
folds <- vfold_cv(isavuco_ml_train, strata = event)# v = 10 folds by default; use repeats = to repeat; strata keeps the event distribution in every fold
## Separate resamples used for the final model assessment
set.seed(456)
folds_cv <- vfold_cv(isavuco_ml_train, strata = event)# 10 folds by default
https://recipes.tidymodels.org/reference/index.html
Usual normalisations to apply per algorithm used: https://www.tmwr.org/pre-proc-table.html
# Shared preprocessing recipe: id kept as an identifier (not a predictor),
# dose_group dropped, concentrations log-transformed, dose Yeo-Johnson
# transformed, then all numeric predictors centred/scaled.
isavuco_ml_rec <- recipe(event ~ ., data = isavuco_ml_train) %>%
  update_role(id, new_role = "id") %>%
  step_rm(dose_group) %>%
  # NOTE(review): contains("conc") also matches diff_conc and ratio_conc,
  # not only conc_0/conc_1 -- confirm this is intended (log of a difference
  # requires diff_conc > -offset).
  step_log(contains("conc"), offset = 0.0001) %>%
  step_YeoJohnson(dose) %>%
  step_normalize(all_numeric_predictors()) # %>%
# step_downsample(event)
# step_smote(event)
# with categorical covariates, one-hot encode via step_dummy --> slows rf or xgb models: keep factors in those models
# with too many categories, reduce them with step_other
isavuco_ml_rec_prep <- prep(isavuco_ml_rec )
isavuco_train_recipe <-bake(isavuco_ml_rec_prep, new_data = NULL)
isavuco_test_recipe <-bake(isavuco_ml_rec_prep, new_data = isavuco_ml_test)
# Class balance of the preprocessed training data (no resampling steps
# are active, so it matches the raw training balance)
isavuco_train_recipe %>%
  ggplot(aes(x = event)) +
  geom_bar() +
  theme_bw()
Encoding particulier des variables si nombreux facteurs, on peut remplacer par la valeur moyenne d’outcome (regression) ou la probabilité (classif). Quand peu de valeurs dans une categories, on utilise du partial pooling avec des modeles mixtes –> shrinke ces valeurs vers la moyenne
Example non run of encoding a factor using the mean of the outcome
library(embed)
# step_lencode_glm(variable_facteur, outcome = vars(outcome)) %>%
#step_lencode_mixed(variable_facteur , outcome = vars(outcome)) %>%
# necessite d'etre tune pour eviter overfitting
#et peu gerer des nouvelles categories
# pour voir valeurs encodées
# glm_estimates <-
# prep(nom_recipe) %>%
# tidy(number = 2)# 2 etant la position du step dans la recipe
https://parsnip.tidymodels.org/reference/boost_tree.html
Parameters to tune
mtry A number for the number (or proportion) of predictors that will be randomly sampled at each split when creating the tree models
trees An integer for the number of trees contained in the ensemble.
min_n An integer for the minimum number of data points in a node that is required for the node to be split further.
tree_depth An integer for the maximum depth of the tree (i.e. number of splits) (specific engines only).
learn_rate A number for the rate at which the boosting algorithm adapts from iteration-to-iteration (specific engines only). This is sometimes referred to as the shrinkage parameter.
sample_size A number for the number (or proportion) of data that is exposed to the fitting routine. For xgboost, the sampling is done at each iteration
# XGBoost specification: all six main hyperparameters are tuned
xgb_spec <-
  boost_tree(
    mode        = "classification",
    mtry        = tune(),
    trees       = tune(),
    min_n       = tune(),
    sample_size = tune(),
    tree_depth  = tune(),
    learn_rate  = tune()
  ) %>%
  set_engine("xgboost")

# Workflow = shared recipe + model specification
xgb_wf <- workflow() %>%
  add_recipe(isavuco_ml_rec) %>%
  add_model(xgb_spec)
Example of tuning with finetune
Race approach: here, the tuning process evaluates all models on an initial subset of resamples. Based on their current performance metrics, some parameter sets are not considered in subsequent resamples.
# library(finetune)
#
# set.seed(234)
# tune_xgb_ft <-
# tune_race_anova(
# xgb_wf,
# folds,
# grid = 50,
# metrics = metric_set(mn_log_loss, accuracy),
# control = control_race(verbose_elim = TRUE)
# )
#
# tune_xgb_ft
# plot_race(tune_xgb_ft)
#define metrics of interest
# Groups are respected on the new metric function
#class_metrics <- metric_set(accuracy, roc_auc, f_meas_beta2)
library(doParallel)
# Step 1: Set up parallel backend
# fix: guard against parallel::detectCores() returning NA (possible on
# some platforms) or 1, either of which would request a 0-worker cluster
# and make makeCluster() fail.
num_cores <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(num_cores) # Create a cluster with the specified number of cores
registerDoParallel(cl) # Register the cluster for parallel processing
# Inform the user about the parallel setup
message("Parallel backend set up with ", num_cores, " cores.")
# Step 2: Define the tuning process with parallel control
set.seed(234) # Set seed for reproducibility
tune_xgb <- tune_grid(
  xgb_wf, # Workflow object
  resamples = folds, # Cross-validation folds
  grid = 60, # Number of tuning parameter combinations
  #metrics = class_metrics, # specify the metrics of interest
  control = control_grid(
    verbose = TRUE, # Display progress
    allow_par = TRUE, # Enable parallel processing
    parallel_over = "everything", # Parallelize across resamples and grid combinations
    save_pred = TRUE,
    save_workflow = TRUE
  )
)
# Step 3: Stop the cluster after tuning
stopCluster(cl) # Shut down the parallel cluster
registerDoSEQ() # Revert to sequential execution
message("Parallel backend stopped and reverted to sequential execution.")
# View the tuning results
autoplot(tune_xgb, scientific = FALSE) +
  theme_bw() +
  ggtitle("tuning hyperparameter")
During the search process, these methods predict which values to test next. They can be used in a second step, after a classical tune_grid, to further optimise the hyperparameters.
The process of using simulated annealing starts with an initial value and embarks on a controlled random walk through the parameter space. Each new candidate parameter value is a small perturbation of the previous value that keeps the new point within a local neighborhood.
# Simulated-annealing refinement (library(finetune)), seeded from the grid
# search results: a controlled random walk in hyperparameter space that
# stops after 10 iterations without improvement.
set.seed(1404)
xgb_sa <-
  xgb_wf %>%
  tune_sim_anneal(
    resamples = folds,
    initial = tune_xgb,
    iter = 30,
    control = control_sim_anneal(verbose = TRUE, no_improve = 10L)
  )
# Performance across iterations
autoplot(xgb_sa, type = "performance")
# Parameter values explored across iterations
autoplot(xgb_sa, type = "parameters")
# Best hyperparameter combinations ranked by accuracy
show_best(xgb_sa, metric = "accuracy")
## # A tibble: 5 × 13
## mtry trees min_n tree_depth learn_rate sample_size .metric .estimator mean
## <int> <int> <int> <int> <dbl> <dbl> <chr> <chr> <dbl>
## 1 2 1347 4 5 0.00237 0.702 accuracy binary 0.930
## 2 7 1571 3 14 0.0131 0.606 accuracy binary 0.930
## 3 6 1571 5 12 0.0286 0.611 accuracy binary 0.929
## 4 6 1879 3 13 0.0262 0.496 accuracy binary 0.929
## 5 2 1783 5 7 0.00563 0.890 accuracy binary 0.927
## # ℹ 4 more variables: n <int>, std_err <dbl>, .config <chr>, .iter <int>
# Keep the best configuration by accuracy and lock it into the model spec
best_rmse_xgb <- select_best(xgb_sa, metric = "accuracy")
final_xgb <- finalize_model(xgb_spec, best_rmse_xgb)
final_xgb
## Boosted Tree Model Specification (classification)
##
## Main Arguments:
## mtry = 2
## trees = 1347
## min_n = 4
## tree_depth = 5
## learn_rate = 0.00237340318254643
## sample_size = 0.701563181410132
##
## Computational engine: xgboost
# Finalized workflow, fitted on the training data
final_wf_xgb <-
  workflow() %>%
  add_recipe(isavuco_ml_rec) %>%
  add_model(final_xgb) %>%
  fit(isavuco_ml_train)

# Same finalized workflow kept unfitted (for a later last-fit style use)
final_wf_xgb_non_fit <-
  workflow() %>%
  add_recipe(isavuco_ml_rec) %>%
  add_model(final_xgb)
Allows us to evaluate which variables are of interest and their weights.
library(vip)
# Variable-importance plot of the fitted xgboost model (top 20 features)
xgb_fit <- extract_fit_parsnip(final_wf_xgb)
vip(xgb_fit, geom = "point", num_features = 20) + theme_bw()
## 10-fold CV assessment of the finalized workflow
xgb_rs<- fit_resamples(object = final_wf_xgb, resamples = folds_cv, control = control_resamples(verbose=TRUE, save_pred = TRUE, save_workflow = TRUE))
## Resampled performance metrics (accuracy, Brier score, ROC AUC)
xgb_rs %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.927 10 0.00680 Preprocessor1_Model1
## 2 brier_class binary 0.0633 10 0.00489 Preprocessor1_Model1
## 3 roc_auc binary 0.965 10 0.00583 Preprocessor1_Model1
# Confusion matrix pooled over the CV predictions
xgb_rs%>% collect_predictions() %>%
conf_mat(event, .pred_class) %>%
autoplot(type = "heatmap")
# Sensitivity on the pooled CV predictions
xgb_rs%>% collect_predictions() %>% sens(event, .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 sens binary 0.784
# ROC curve from the predicted probability of class 0 (the first level)
xgb_rs%>% collect_predictions() %>% roc_curve(event, .pred_0) %>% autoplot()
Fit and save wf for future use
## Fit the workflow: a fitted workflow object is required to predict from it
fit_workflow <- fit(final_wf_xgb, isavuco_ml_train)
# Example: predictions from the workflow for the first 3 training patients
set.seed(1234)
augment(fit_workflow, isavuco_ml_train %>% slice_head(n = 3))
## # A tibble: 3 × 13
## .pred_class .pred_0 .pred_1 id conc_0 conc_1 dose_group weight age dose
## <fct> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 0 0.941 0.0587 9_5 0.0716 1.20 5mg/kg 46.2 7.83 231.
## 2 0 0.896 0.104 12_5 3.79 5.84 5mg/kg 55.3 14.0 277.
## 3 0 0.857 0.143 15_5 4.05 5.84 5mg/kg 58.0 12.2 290.
## # ℹ 3 more variables: event <fct>, diff_conc <dbl>, ratio_conc <dbl>
# Save the fitted workflow for later use (filename is date-stamped)
saveRDS(fit_workflow, file = str_c("xgboost_classif_workshop_save_",today(),".rds"))
Example of prediction in a new patient
## Prediction for a new, hand-built patient
# dput(names(isavuco_ml_train))
# fix: dose_group was written "5mgkg" while the training levels are
# "5mg/kg"; harmless today only because the recipe step_rm()s the column,
# but kept consistent to avoid a silent new-level problem if that changes.
# NOTE(review): event is supplied as numeric 0 whereas training used a
# factor; it is not used for prediction but the type mismatch shows in the
# augmented output -- confirm this is acceptable.
nouveau_patient <- tibble(id = "toto", conc_0 = 2.8, conc_1 = 5.8, dose_group = "5mg/kg", weight = 20,
                          age = 5, dose = 100, event = 0, diff_conc = conc_1 - conc_0, ratio_conc = conc_1/conc_0)
augment(fit_workflow, nouveau_patient)
## # A tibble: 1 × 13
## .pred_class .pred_0 .pred_1 id conc_0 conc_1 dose_group weight age dose
## <fct> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 0 0.856 0.144 toto 2.8 5.8 5mgkg 20 5 100
## # ℹ 3 more variables: event <dbl>, diff_conc <dbl>, ratio_conc <dbl>
https://parsnip.tidymodels.org/reference/linear_reg
penalty A non-negative number representing the total amount of regularization (specific engines only).
mixture A number between zero and one (inclusive) denoting the proportion of L1 regularization (i.e. lasso) in the model.
mixture = 1 specifies a pure lasso model,
mixture = 0 specifies a ridge regression model, and
0 < mixture < 1 specifies an elastic net model, interpolating lasso and ridge.
Here mixture = 1 gives a LASSO penalisation, while mixture = 0 gives the ridge penalisation.
# Penalized logistic regression: penalty tuned, mixture fixed at 1 (pure LASSO)
lm_spec <-
  logistic_reg(mode = "classification", penalty = tune(), mixture = 1) %>%
  set_engine("glmnet")

# Workflow = shared recipe + penalized model
lm_wf <- workflow() %>%
  add_recipe(isavuco_ml_rec) %>%
  add_model(lm_spec)
# dans le cas d'un modele lineaire non penalise sans preprocessing , on peut aussi utiliser à la place d'add recipe
# add_formula(auc ~ .)
# Tuning the LASSO penalty
# Restrict the penalty to 10^-3 .. 10^0 (log10 scale) before building the grid
narrower_penalty <- penalty(range = c(-3, 0))
set.seed(345)
tune_lm <- tune_grid(
lm_wf,
resamples = folds,
grid = 10,
param_info = parameters(narrower_penalty),
control = control_grid(
verbose = TRUE, # Display progress
allow_par = TRUE, # Enable parallel processing
save_pred = TRUE,
save_workflow = TRUE
)
)
# Visualise the penalty tuning results
autoplot(tune_lm) +
ggtitle("tuning hyperparameter")
# Select the best model by accuracy
# best_rmse_lm <- select_best(tune_lm, "rmse", maximize = F)
# (alternative: choose the simplest model within one standard error)
best_penalty <-
tune_lm %>%
select_best(metric = "accuracy")
best_penalty
## # A tibble: 1 × 2
## penalty .config
## <dbl> <chr>
## 1 0.00143 Preprocessor1_Model01
# Lock the selected penalty into the model specification
final_lm <- finalize_model(
lm_spec,
best_penalty
)
final_lm
## Logistic Regression Model Specification (classification)
##
## Main Arguments:
## penalty = 0.00142539757010873
## mixture = 1
##
## Computational engine: glmnet
# Finalize the LASSO workflow and fit it on the training data
final_wf_lm <-
  workflow() %>%
  add_recipe(isavuco_ml_rec) %>%
  add_model(final_lm) %>%
  fit(isavuco_ml_train)

# Variable-importance plot from the fitted glmnet model (top 20 features)
lm_fit <- extract_fit_parsnip(final_wf_lm)
vip(lm_fit, geom = "point", num_features = 20) + theme_bw()
# 10-fold CV assessment of the finalized LASSO workflow
set.seed(123)
lm_rs<- fit_resamples(object = final_wf_lm, resamples = folds_cv, control = control_resamples(verbose=TRUE, save_pred = TRUE, save_workflow = TRUE))
## Resampled performance metrics
lm_rs %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.911 10 0.00979 Preprocessor1_Model1
## 2 brier_class binary 0.0709 10 0.00578 Preprocessor1_Model1
## 3 roc_auc binary 0.954 10 0.00651 Preprocessor1_Model1
# Confusion matrix pooled over the CV predictions
lm_rs %>% collect_predictions() %>%
conf_mat(event, .pred_class) %>%
autoplot(type = "heatmap")
## Fit and save the workflow for later use (filename is date-stamped)
fit_workflow_lm <- fit(final_wf_lm, isavuco_ml_train)
saveRDS(fit_workflow_lm, file = str_c("wf_classification_lm_workshop_save_",today(),".rds"))
Multivariate Adaptive Regression Splines, or MARS, is an algorithm for complex non-linear regression problems. The MARS algorithm involves discovering a set of simple piecewise linear functions that characterize the data and using them in aggregate to make a prediction. In a sense, the model is an ensemble of linear functions. Each function is piecewise linear, with a knot at the value t. In the terminology of […], these are linear splines. How was the cut point determined? Each data point for each predictor is evaluated as a candidate cut point by creating a linear regression model with the candidate features, and the corresponding model error is calculated. Once the full set of features has been created, the algorithm sequentially removes individual features that do not contribute significantly to the model equation. This “pruning” procedure assesses each predictor variable and estimates how much the error rate was decreased by including it in the model.
https://parsnip.tidymodels.org/reference/details_mars_earth.html
num_terms The number of features that will be retained in the final model, including the intercept.
prod_degree The highest possible interaction degree.
# MARS specification: the number of retained terms and the interaction
# degree are tuned
mars_spec <-
  mars(mode = "classification", num_terms = tune(), prod_degree = tune()) %>%
  set_engine("earth")

# Workflow = shared recipe + MARS model
mars_wf <- workflow() %>%
  add_recipe(isavuco_ml_rec) %>%
  add_model(mars_spec)
## Resampling folds for tuning
set.seed(2345)
folds <- vfold_cv(isavuco_ml_train, strata = event)# 10 folds by default; same seed as above, so identical to the earlier `folds`
# Tuning over a 20-point grid
set.seed(345)
tune_mars <- tune_grid(
mars_wf,
resamples = folds,
grid = 20,
control = control_grid(
verbose = TRUE, # Display progress
allow_par = TRUE, # Enable parallel processing
save_pred = TRUE,
save_workflow = TRUE
)
)
# Visualise the tuning results
autoplot(tune_mars) +
ggtitle("tuning hyperparameter")
# Select the best model
# NOTE(review): no metric is given, so select_best() uses the first metric
# of the default set -- pass metric = "accuracy" to be explicit.
best_rmse_mars <- select_best(tune_mars)
final_mars <- finalize_model(
mars_spec,
best_rmse_mars
)
final_mars
## MARS Model Specification (classification)
##
## Main Arguments:
## num_terms = 5
## prod_degree = 1
##
## Computational engine: earth
# Finalize the workflow and fit it on the training data
final_wf_mars <- workflow() %>%
add_recipe(isavuco_ml_rec) %>%
add_model(final_mars) %>%
fit(isavuco_ml_train)
# Variable-importance plot for the fitted MARS model (top 20 features)
mars_fit <- extract_fit_parsnip(final_wf_mars)
vip(mars_fit, geom = "point", num_features = 20) + theme_bw()
### 10-fold CV assessment ###
set.seed(123)
mars_rs<- fit_resamples(object = final_wf_mars, resamples = folds_cv, control = control_resamples(verbose=TRUE, save_pred = TRUE, save_workflow = TRUE))
## Resampled performance metrics
mars_rs %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.902 10 0.0113 Preprocessor1_Model1
## 2 brier_class binary 0.0720 10 0.00523 Preprocessor1_Model1
## 3 roc_auc binary 0.955 10 0.00625 Preprocessor1_Model1
# Confusion matrix pooled over the CV predictions
mars_rs %>%
collect_predictions() %>%
conf_mat(event, .pred_class) %>%
autoplot(type = "heatmap")
## Fit and save the workflow for later use (filename is date-stamped)
fit_workflow_mars <- fit(final_wf_mars, isavuco_ml_train)
saveRDS(fit_workflow_mars, file = str_c("wf_classification_mars_workshop_save_",today(),".rds"))
SVM works by mapping data to a high-dimensional feature space so that data points can be categorized, even when the data are not otherwise linearly separable. A separator between the categories is found, then the data are transformed in such a way that the separator could be drawn as a hyperplane. Following this, characteristics of new data can be used to predict the group to which a new record should belong.
https://parsnip.tidymodels.org/reference/svm_poly.html
# Model: linear SVM with a tuned cost parameter.
# NOTE(review): kernlab::ksvm() does not document an `importance`
# argument; the extra engine argument is most likely ignored -- confirm
# (the vip code further down is commented out).
svm_spec <- svm_linear(mode = "classification",
cost = tune()
) %>% set_engine("kernlab", importance = "permutation")
# Workflow = shared recipe + SVM specification
svm_wf<- workflow() %>%
add_recipe(isavuco_ml_rec) %>%
add_model(svm_spec)
#
## Resampling folds for tuning
set.seed(2345)
folds <- vfold_cv(isavuco_ml_train, strata = event)# 10 folds by default
# Tuning over a 10-point grid
set.seed(345)
tune_svm <- tune_grid(
svm_wf,
resamples = folds,
grid = 10,
control = control_grid(
verbose = TRUE, # Display progress
allow_par = TRUE, # Enable parallel processing
save_pred = TRUE,
save_workflow = TRUE
)
)
# Visualise the tuning results (accuracy only)
autoplot(tune_svm, metric = "accuracy") +
ggtitle("tuning hyperparameter")
# Select the best model
# NOTE(review): no metric is given -- select_best() defaults to the first
# metric; pass metric = "accuracy" to be explicit.
best_rmse_svm <- select_best(tune_svm)
final_svm <- finalize_model(
svm_spec,
best_rmse_svm
)
final_svm
## Linear Support Vector Machine Model Specification (classification)
##
## Main Arguments:
## cost = 1.57447597274493
##
## Engine-Specific Arguments:
## importance = permutation
##
## Computational engine: kernlab
#visualisation des résultats
# final_svm %>%
# set_engine("kernlab", importance = "permutation") %>%
# fit(auc ~ .,
# data = juice(isavuco_ml_rec_prep)
# ) %>%
# vip::vip(geom = "col")
# Finalize the SVM workflow (left unfitted here; fitted further down)
final_wf_svm <- workflow() %>%
add_recipe(isavuco_ml_rec) %>%
add_model(final_svm)
### 10-fold CV assessment ###
set.seed(456)
folds_cv<- vfold_cv(isavuco_ml_train, strata = event)# 10 folds by default; same seed as the earlier folds_cv, so identical resamples
set.seed(123)
svm_rs<- fit_resamples(object = final_wf_svm, resamples = folds_cv, control = control_resamples(verbose=TRUE, save_pred = TRUE, save_workflow = TRUE))
## Resampled performance metrics
svm_rs %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.909 10 0.00968 Preprocessor1_Model1
## 2 brier_class binary 0.0711 10 0.00576 Preprocessor1_Model1
## 3 roc_auc binary 0.956 10 0.00645 Preprocessor1_Model1
# Confusion matrix pooled over the CV predictions
svm_rs %>%
collect_predictions() %>%
conf_mat(event, .pred_class) %>%
autoplot(type = "heatmap")
## Fit the workflow (kernlab prints "Setting default kernel parameters")
fit_workflow_svm <- fit(final_wf_svm, isavuco_ml_train)
## Setting default kernel parameters
# fix: date-stamp the filename like every other saveRDS() in this file
# instead of the hard-coded "080323"
saveRDS(fit_workflow_svm, file = str_c("wf_svm_classif_workshop_save_", today(), ".rds"))
Tuning Parameters This model has 6 tuning parameters:
hidden_units: # Hidden Units (type: integer, default: 3L)
penalty: Amount of Regularization (type: double, default: 0.0)
epochs: # Epochs (type: integer, default: 100L)
dropout: Dropout Rate (type: double, default: 0.0)
learn_rate: Learning Rate (type: double, default: 0.01)
activation: Activation Function (type: character, default: ‘relu’)
# Single-hidden-layer neural network (MLP) specification; tune hidden units,
# dropout and learning rate, leave the other parameters at engine defaults.
# NOTE(review): an earlier comment mentioned the new `stop_iter` parameter,
# but it is not set or tuned here — confirm whether early stopping is wanted.
# NOTE(review): confirm the brulee engine accepts `importance = "permutation"`.
mlp_spec <- mlp(mode = "classification",
hidden_units = tune(),
dropout = tune(),
learn_rate = tune()
) %>% set_engine("brulee", importance = "permutation")
# Workflow = preprocessing recipe + model specification
mlp_wf<- workflow() %>%
add_recipe(isavuco_ml_rec) %>%
add_model(mlp_spec)
#
library(doParallel)
# Step 1: Set up parallel backend
num_cores <- parallel::detectCores() - 1 # Use all but one core to leave resources for the system
cl <- makeCluster(num_cores) # Create a cluster with the specified number of cores
registerDoParallel(cl) # Register the cluster for parallel processing
# Inform the user about the parallel setup
message("Parallel backend set up with ", num_cores, " cores.")
# Step 2: Define the tuning process with parallel control
# NOTE(review): tuning uses `folds` (created earlier, outside this excerpt)
# while resampling of the finalized workflows uses `folds_cv` — confirm the
# difference is intentional.
set.seed(234) # Set seed for reproducibility
tune_mlp <- tune_grid(
mlp_wf,
resamples = folds,
grid = 30,
control = control_grid(
allow_par = TRUE, # Enable parallel processing
parallel_over = "everything", # Parallelize across resamples and grid combinations,
verbose = TRUE,
save_pred = TRUE,
save_workflow = TRUE
)
)
# Step 3: Stop the cluster after tuning
stopCluster(cl) # Shut down the parallel cluster
registerDoSEQ() # Revert to sequential execution
message("Parallel backend stopped and reverted to sequential execution.")
# Visualize MLP tuning results: accuracy for each candidate configuration
autoplot(tune_mlp, metric = "accuracy") +
ggtitle("tuning hyperparameter")
# Select the best hyperparameter combination
best_rmse_mlp <- select_best(tune_mlp)
final_mlp <- finalize_model(
mlp_spec,
best_rmse_mlp
)
final_mlp
## Single Layer Neural Network Model Specification (classification)
##
## Main Arguments:
## hidden_units = 7
## dropout = 0.710316353694846
## learn_rate = 0.240697044141523
##
## Engine-Specific Arguments:
## importance = permutation
##
## Computational engine: brulee
# Finalize the workflow and fit it on the full training set
final_wf_mlp <- workflow() %>%
add_recipe(isavuco_ml_rec) %>%
add_model(final_mlp) %>%
fit(isavuco_ml_train)
# Variable importance of the fitted MLP (plot kept commented out)
mlp_fit <- extract_fit_parsnip(final_wf_mlp)
#vip(mlp_fit, geom = "point", num_features = 20) + theme_bw()
### Resampling of the finalized MLP workflow ####
set.seed(123)
mlp_rs<- fit_resamples(object = final_wf_mlp, resamples = folds_cv, control = control_resamples(verbose=TRUE, save_pred = TRUE, save_workflow = TRUE))
## Cross-validated performance of the finalized MLP workflow
# NOTE(review): n = 7 below (not 10 folds) — some resamples apparently failed;
# check the fit_resamples() notes/warnings.
mlp_rs %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.847 7 0.0235 Preprocessor1_Model1
## 2 brier_class binary 0.0952 7 0.00769 Preprocessor1_Model1
## 3 roc_auc binary 0.947 7 0.0130 Preprocessor1_Model1
# Confusion matrix pooled over the out-of-fold predictions
mlp_rs %>%
collect_predictions() %>%
conf_mat(event, .pred_class) %>%
autoplot(type = "heatmap")
## Fit on the full training set and save the workflow to disk
fit_workflow_mlp <- fit(final_wf_mlp, isavuco_ml_train)
saveRDS(fit_workflow_mlp, file = "wf_mlp_classif_isavuco_080323.rds")
Tuning parameters — this model has 3 tuning parameters:
mtry: # Randomly Selected Predictors (type: integer, default: see below)
trees: # Trees (type: integer, default: 500L)
min_n: Minimal Node Size (type: integer, default: see below)
# Random-forest specification: tune mtry and min_n, fix the ensemble at 1000 trees
rf_spec <- rand_forest(mode = "classification",
mtry = tune(),
trees = 1000,
min_n = tune()
) %>% set_engine("ranger", importance = "permutation")
# Workflow = preprocessing recipe + model specification
rf_wf<- workflow() %>%
add_recipe(isavuco_ml_rec) %>%
add_model(rf_spec)
#
# Grid tuning (20 candidate combinations)
# NOTE(review): uses `folds` (defined outside this excerpt), not `folds_cv`.
set.seed(345)
tune_rf <- tune_grid(
rf_wf,
resamples = folds,
grid = 20,
control = control_grid(
verbose = TRUE, save_pred = TRUE, save_workflow = TRUE
)
)
# Visualize RF tuning results: accuracy for each candidate configuration
autoplot(tune_rf, metric = "accuracy") +
ggtitle("tuning hyperparameter")
# Select the best hyperparameter combination
best_rmse_rf <- select_best(tune_rf)
final_rf <- finalize_model(
rf_spec,
best_rmse_rf
)
final_rf
## Random Forest Model Specification (classification)
##
## Main Arguments:
## mtry = 2
## trees = 1000
## min_n = 15
##
## Engine-Specific Arguments:
## importance = permutation
##
## Computational engine: ranger
# Finalize the workflow and fit it on the full training set
final_wf_rf <- workflow() %>%
add_recipe(isavuco_ml_rec) %>%
add_model(final_rf) %>%
fit(isavuco_ml_train)
# Permutation variable importance of the fitted ranger model (top 20 predictors)
rf_fit <- extract_fit_parsnip(final_wf_rf)
vip(rf_fit, geom = "point", num_features = 20) + theme_bw()
### Resampling of the finalized RF workflow ####
set.seed(123)
rf_rs<- fit_resamples(object = final_wf_rf, resamples = folds_cv, control = control_resamples(verbose=TRUE, save_pred = TRUE, save_workflow = TRUE))
## Cross-validated performance of the finalized RF workflow
rf_rs %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.930 10 0.00789 Preprocessor1_Model1
## 2 brier_class binary 0.0610 10 0.00459 Preprocessor1_Model1
## 3 roc_auc binary 0.964 10 0.00588 Preprocessor1_Model1
# Confusion matrix pooled over the out-of-fold predictions
rf_rs %>%
collect_predictions() %>%
conf_mat(event, .pred_class) %>%
autoplot(type = "heatmap")
## Fit on the full training set and save the workflow to disk
fit_workflow_rf <- fit(final_wf_rf, isavuco_ml_train)
saveRDS(fit_workflow_rf, file = "wf_rf_classif_isavuco_080323.rds")
https://www.tidymodels.org/find/parsnip/
For example, random_forest: https://parsnip.tidymodels.org/reference/rand_forest.html. Hyperparameters to tune: mtry — an integer for the number of predictors that will be randomly sampled at each split when creating the tree models.
trees An integer for the number of trees contained in the ensemble.
min_n An integer for the minimum number of data points in a node that are required for the node to be split further. null_model: https://parsnip.tidymodels.org/reference/null_model.html
Use of rf
## Last fit: evaluate the best-performing workflow (RF) on the held-out test set
final_res <- final_wf_rf %>% # use the workflow with the best resampling performance
augment(isavuco_ml_test)
final_res %>%
conf_mat(event, .pred_class) %>%
autoplot(type = "heatmap")
# Test-set metrics: accuracy, sensitivity, specificity, precision, recall, NPV, AUCs
final_res %>% accuracy(event, .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.907
final_res %>% yardstick::sens(event, .pred_class) #, event_level="second", estimator = "binary")
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 sens binary 0.75
final_res %>% yardstick::spec(event, .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 spec binary 0.959
final_res %>% yardstick::precision(event, .pred_class)# precision = positive predictive value
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 precision binary 0.857
final_res %>% yardstick::recall(event, .pred_class)# recall = sensitivity
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 recall binary 0.75
final_res %>% yardstick::npv(event, .pred_class)# npv = negative predictive value
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 npv binary 0.920
# AUCs use `.pred_0`, the probability of the first factor level ("0") —
# yardstick's default event level
final_res %>% roc_auc(event, .pred_0)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.967
final_res %>% pr_auc(event, .pred_0)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 pr_auc binary 0.917
# Precision-recall curve
final_res %>% pr_curve(event, .pred_0) %>% autoplot()
# ROC curve on the held-out test set for the final workflow.
# NOTE(review): the object is named `roc_curve_xgb` but it is built from
# `final_res`, i.e. the random-forest last fit — name kept for compatibility.
roc_curve_xgb <- final_res %>%
  roc_curve(event, .pred_0) %>%
  ggplot(aes(1 - specificity, sensitivity)) +
  geom_abline(lty = 2, color = "gray80", linewidth = 1.5) +
  # FIX: `size` is deprecated for line geoms since ggplot2 3.4.0 — use
  # `linewidth`, consistent with the geom_abline() call above
  geom_path(alpha = 0.8, linewidth = 1.2) +
  coord_equal() +
  theme_bw()
roc_curve_xgb
# Permutation variable importance of the final RF fit (top 5 predictors)
library(vip)
rf_fit <- extract_fit_parsnip(final_wf_rf)
vip(rf_fit, geom = "point", num_features = 5) + theme_bw()
Variable importance via SHAP is not supported for the random forest fit; the SHAP analysis below presumably uses the XGBoost workflow (`final_wf_xgb`) instead — confirm.
#
library(shapviz)
# Preprocessing recipe used for the models (kept commented out for reference)
# isavuco_ml_rec <- recipe(event ~ ., data = isavuco_ml_train) %>%
# update_role(id, new_role = "id") %>%
# step_rm(dose_group) %>%
# step_YeoJohnson(contains("conc"), dose) %>%
# step_normalize(all_numeric_predictors()) %>%
# step_downsample()
# # if categorical covariates: one-hot encoding with step_dummy slows rf/xgb models --> keep factors for these models
# # if there are too many categories, reduce them with step_other
#
# isavuco_ml_rec_prep <- prep(isavuco_ml_rec )
# isavuco_train_recipe <-bake(isavuco_ml_rec_prep, new_data = NULL)
# isavuco_test_recipe <-bake(isavuco_ml_rec_prep, new_data = isavuco_ml_test)
# Bake the test-set predictors into a matrix for SHAP computation
xgb_test_recipe_shap <- bake(
prep(isavuco_ml_rec),
has_role("predictor"),
new_data = isavuco_ml_test,
composition = "matrix"
)
# SHAP values for the XGBoost engine fit
# NOTE(review): `final_wf_xgb` is fitted outside this excerpt, and
# `isavuco_test_recipe` is only created in the commented-out code above —
# confirm both exist in the session before running this chunk.
shap <- shapviz(extract_fit_engine(final_wf_xgb), X_pred = xgb_test_recipe_shap, X = isavuco_test_recipe)
# Generate SHAP importance plots
shap_imp <- sv_importance(shap, kind = "both", show_numbers = TRUE, max_display = 40L) +
ggtitle("SHAP Importance")
# Display the plots
print(shap_imp)
Dependence plots
# SHAP dependence plot: effect of ratio_conc, colored by dose
dep_plot <- sv_dependence(shap, "ratio_conc", color_var = "dose") +
ggtitle("Dependence Plot")
# Display the plots
print(dep_plot)
Individual force plot
# SHAP force plot for a single test observation (row 1)
force_plot <- sv_force(shap, row_id = 1) +
ggtitle("Force Plot")
# Display the plots
print(force_plot)
## Create a DALEX explainer for the fitted RF workflow.
## FIX: for a classification task DALEX expects a numeric 0/1 target; the
## original passed the factor `event`, which triggered the WARNING in the
## captured output below and left residuals as NA. The "0"/"1" factor is
## therefore converted to a numeric 0/1 vector here.
explainer_external <- explain_tidymodels(
model = final_wf_rf,
data = isavuco_ml_train,
y = as.numeric(as.character(isavuco_ml_train$event)),
label = "rf")
## Console output captured from the original run (with factor y):
## Preparation of a new explainer is initiated
## -> model label : rf
## -> data : 672 rows 10 cols
## -> data : tibble converted into a data.frame
## -> target variable : 672 values
## -> predict function : yhat.workflow will be used ( default )
## -> predicted values : No value for predict function target column. ( default )
## -> model_info : package tidymodels , ver. 1.1.1 , task classification ( default )
## -> model_info : Model info detected classification task but 'y' is a factor . ( WARNING )
## -> model_info : By deafult classification tasks supports only numercical 'y' parameter.
## -> model_info : Consider changing to numerical vector with 0 and 1 values.
## -> model_info : Otherwise I will not be able to calculate residuals or loss function.
## -> predicted values : numerical, min = 0.008245485 , mean = 0.7510106 , max = 1
## -> residual function : difference between y and yhat ( default )
## -> residuals : numerical, min = NA , mean = NA , max = NA
## A new explainer has been created!
https://github.com/pbiecek/breakDown & https://arxiv.org/abs/1804.01955
There are multiple possible approaches to understanding why a model predicts a given class. One is a break-down explanation: it computes how contributions attributed to individual features change the mean model’s prediction for a particular observation.
# Break-down explanation for one randomly sampled test observation:
# per-feature contributions to the model's prediction for that case
bd_plot <- predict_parts(explainer = explainer_external,
new_observation = slice_sample(isavuco_ml_test, n=1),
type = "break_down")
plot(bd_plot, max_features = 5)
Partial dependence profiles show how the expected value of a model prediction changes as a function of a feature.
# Partial dependence profile for diff_conc
pdp_diff_conc_12_0 <- model_profile(
explainer_external,
variables = "diff_conc",
N = NULL # number of training-set observations to use; NULL = all
)
plot(pdp_diff_conc_12_0)
# Partial dependence profile for dose
pdp_dose <- model_profile(
explainer_external,
variables = "dose",
N = NULL # number of training-set observations to use; NULL = all
)
plot(pdp_dose)
If a model result indicated that you had a 51% chance of having contracted COVID-19, it would be natural to view the diagnosis with some skepticism
# Flag test-set observations the final RF workflow got wrong ("1") vs right
# ("0"), then look at where the errors fall on the predicted-probability scale.
test_pred <- final_wf_rf %>%
  augment(isavuco_ml_test) %>%
  mutate(missclassified = as.factor(if_else(.pred_class == event, 0, 1)))
ggplot(test_pred, aes(x = .pred_1, fill = missclassified)) +
  geom_histogram()
## Create an equivocal zone around the 0.50 probability threshold
library(probably)
lvls <- levels(isavuco_ml_train$event)
test_pred <-
test_pred %>%
mutate(.pred_with_eqz = make_two_class_pred(.pred_0, lvls, buffer = 0.06))
test_pred %>% count(.pred_with_eqz) # Rows within 0.50 ± 0.06 are given the value [EQ]
## # A tibble: 3 × 2
## .pred_with_eqz n
## <clss_prd> <int>
## 1 [EQ] 8
## 2 0 44
## 3 1 173
# Confusion matrix on all data
test_pred %>% conf_mat(event, .pred_class)
## Truth
## Prediction 0 1
## 0 42 7
## 1 14 162
# Reportable results only:
test_pred %>% conf_mat(event, .pred_with_eqz)
## Truth
## Prediction 0 1
## 0 40 4
## 1 14 159
It is possible to tune the buffer threshold that defines the equivocal zone.
#' Recompute accuracy and reportable rate for a given equivocal-zone buffer.
#'
#' @param buffer Half-width of the equivocal zone around 0.50.
#' @param preds Data frame of test predictions containing `.pred_0` and
#'   `event`; defaults to the global `test_pred` for backward compatibility.
#' @param class_lvls Factor levels of the outcome; defaults to the global
#'   `lvls` for backward compatibility.
#' @return A one-row tibble: accuracy, reportable rate, and the buffer used.
eq_zone_results <- function(buffer, preds = test_pred, class_lvls = lvls) {
  preds <- preds %>%
    mutate(.pred_with_eqz = make_two_class_pred(.pred_0, class_lvls, buffer = buffer))
  acc <- preds %>% accuracy(event, .pred_with_eqz)
  rep_rate <- reportable_rate(preds$.pred_with_eqz)
  tibble(accuracy = acc$.estimate, reportable = rep_rate, buffer = buffer)
}
# Evaluate a sequence of buffers and plot accuracy vs reportable rate.
map_dfr(seq(0, .1, length.out = 20), eq_zone_results) %>%
  pivot_longer(c(-buffer), names_to = "statistic", values_to = "value") %>%
  ggplot(aes(x = buffer, y = value, lty = statistic)) +
  # FIX: `size` is deprecated for line geoms since ggplot2 3.4.0 — use `linewidth`
  geom_step(linewidth = 1.2, alpha = 0.8) +
  labs(y = NULL, lty = NULL) +
  theme_bw()
The approach used is a fairly simple unsupervised method that attempts to measure how much (if any) a new data point is beyond the training data. The idea is to accompany a prediction with a score that measures how similar the new point is to the training set. A percentile can be computed for new samples that reflect how much of the training set is less extreme than the new samples.
# Applicability domain: PCA-based distance of new samples to the training set
library(applicable)
pca_stat <- apd_pca(isavuco_ml_rec, data = isavuco_ml_train , threshold = 0.95)
pca_stat
## # Predictors:
## 8
## # Principal Components:
## 4 components were needed
## to capture at least 95% of the
## total variation in the predictors.
autoplot(pca_stat, distance) + labs(x = "distance")
Compute percentile in new data
# Percentile of the PCA distance for one random test observation
# (low percentile = well inside the training domain)
score(pca_stat, isavuco_ml_test %>% slice_sample(n=1)) %>% select(starts_with("distance")) %>% arrange(desc(distance_pctl))
## # A tibble: 1 × 2
## distance distance_pctl
## <dbl> <dbl>
## 1 1.75 29.2
# kernlab also exports coef(); make sure the kernlab version is preferred
conflicted::conflicts_prefer(kernlab::coef)
set.seed(1234)
# NOTE(review): tune_xgb, tune_lm and tune_mars are created outside this
# excerpt — confirm they exist in the session.
isavuco_classif_model_st <-
# initialize the stack
stacks() %>%
# add candidate members
add_candidates(tune_xgb) %>%
add_candidates(tune_lm) %>%
add_candidates(tune_mars) %>%
add_candidates(tune_svm) %>%
add_candidates(tune_rf) %>%
# determine how to combine their predictions
blend_predictions(metric = metric_set(accuracy)) %>%
# fit the candidates with nonzero stacking coefficients
fit_members()
## Setting default kernel parameters
isavuco_classif_model_st
## # A tibble: 9 × 3
## member type weight
## <chr> <chr> <dbl>
## 1 .pred_1_tune_xgb_1_45 boost_tree 28.7
## 2 .pred_1_tune_rf_1_08 rand_forest 4.52
## 3 .pred_1_tune_xgb_1_36 boost_tree 0.935
## 4 .pred_1_tune_mars_1_8 mars 0.779
## 5 .pred_1_tune_rf_1_14 rand_forest 0.704
## 6 .pred_1_tune_svm_1_07 svm_linear 0.617
## 7 .pred_1_tune_lm_1_04 logistic_reg 0.476
## 8 .pred_1_tune_lm_1_02 logistic_reg 0.295
## 9 .pred_1_tune_lm_1_07 logistic_reg 0.204
To make sure that we have the right trade-off between minimizing the number of members and optimizing performance we use autoplot If these results were not good enough, blend_predictions() could be called again with different values of penalty. As it is, blend_predictions() picks the penalty parameter with the numerically optimal results.
# Diagnostics of the stack: penalty trade-off, member fits, blending weights
autoplot(isavuco_classif_model_st)
autoplot(isavuco_classif_model_st, type = "members")
autoplot(isavuco_classif_model_st, type = "weights")
# ======================================================
# 1️⃣ Generate Predictions: Probability & Class Labels
# ======================================================
# Generate probability predictions (for ROC & PR curves)
isavuco_class_pred_prob <- isavuco_ml_test %>%
bind_cols(predict(isavuco_classif_model_st, ., type = "prob"))
# Generate class predictions (for classification metrics & confusion matrix)
isavuco_class_pred_class <- isavuco_ml_test %>%
bind_cols(predict(isavuco_classif_model_st, ., type = "class"))
# Ensure `event` is a factor with correct levels
isavuco_class_pred_class <- isavuco_class_pred_class %>%
mutate(event = factor(event)) # Ensures classification metrics work properly
# ======================================================
# 2️⃣ Define and Compute Classification Metrics
# ======================================================
# Define classification metrics (for class predictions)
# NOTE(review): `f_meas_beta2` is not a yardstick built-in; it is presumably a
# custom F-beta(2) metric defined earlier in the project — confirm it is in scope.
class_metrics <- metric_set(accuracy, f_meas, sens, spec, ppv, npv, f_meas_beta2)
# Compute classification metrics using **class predictions**
classification_results <- isavuco_class_pred_class %>%
class_metrics(truth = event, estimate = .pred_class)
# Compute AUC for each model (for ROC & PR curve legends) using **probability predictions**
# `.pred_0` is the probability of the first factor level ("0"), yardstick's default event level
auc_values <- isavuco_class_pred_prob %>%
pivot_longer(cols = starts_with(".pred_0"), names_to = "model", values_to = "probability") %>%
group_by(model) %>%
summarise(roc_auc = roc_auc_vec(event, probability),
pr_auc = pr_auc_vec(event, probability))
# Convert AUC values to a formatted string for legend labels
auc_labels <- auc_values %>%
mutate(label = paste0(model, " (ROC AUC: ", round(roc_auc, 3), ", PR AUC: ", round(pr_auc, 3), ")"))
# ======================================================
# 3️⃣ Compute ROC Curve for Probability Predictions
# ======================================================
# Reshape probability predictions for ROC analysis (one row per threshold per model)
roc_curve_data <- isavuco_class_pred_prob %>%
pivot_longer(cols = starts_with(".pred_0"), names_to = "model", values_to = "probability") %>%
mutate(probability = as.numeric(probability)) %>% # Ensure numeric
group_by(model) %>%
roc_curve(event, probability) %>%
ungroup() %>%
left_join(auc_labels, by = "model") # Add AUC labels to dataset
# ======================================================
# 4️⃣ Compute Precision-Recall (PR) Curve
# ======================================================
# Reshape probability predictions for PR curve analysis
pr_curve_data <- isavuco_class_pred_prob %>%
pivot_longer(cols = starts_with(".pred_0"), names_to = "model", values_to = "probability") %>%
mutate(probability = as.numeric(probability)) %>% # Ensure numeric
group_by(model) %>%
pr_curve(event, probability) %>%
ungroup() %>%
left_join(auc_labels, by = "model") # Add AUC labels to dataset
# ======================================================
# 5️⃣ Plot ROC Curve (with AUC in Legend)
# ======================================================
# FIX: ggplot2 >= 3.4.0 deprecates `size` for line geoms — use `linewidth`
# throughout, consistent with the geom_abline() call that already used it.
roc_curve_plot <- ggplot(roc_curve_data, aes(x = 1 - specificity, y = sensitivity, color = label)) +
  geom_abline(lty = 2, color = "gray80", linewidth = 1.2) + # Diagonal reference line
  geom_path(linewidth = 1.2, alpha = 0.8) + # ROC curves
  labs(title = "ROC Curves for Stacked Model and Members",
       x = "1 - Specificity (False Positive Rate)",
       y = "Sensitivity (True Positive Rate)",
       color = "Model (AUC Values)") +
  coord_equal() +
  theme_bw()
# ======================================================
# 6️⃣ Plot Precision-Recall (PR) Curve (with AUC in Legend)
# ======================================================
pr_curve_plot <- ggplot(pr_curve_data, aes(x = recall, y = precision, color = label)) +
  geom_path(linewidth = 1.2, alpha = 0.8) + # PR curves
  labs(title = "Precision-Recall Curves for Stacked Model and Members",
       x = "Recall (Sensitivity)",
       y = "Precision (Positive Predictive Value)",
       color = "Model (AUC Values)") +
  theme_bw()
# ======================================================
# 7️⃣ Compute and Display Confusion Matrix (Contingency Table)
# ======================================================
confusion_matrix_results <- isavuco_class_pred_class %>%
conf_mat(truth = event, estimate = .pred_class)
# ======================================================
# 8️⃣ Display All Results
# ======================================================
# Print classification metrics
print(classification_results)
## # A tibble: 7 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.902
## 2 f_meas binary 0.78
## 3 sens binary 0.696
## 4 spec binary 0.970
## 5 ppv binary 0.886
## 6 npv binary 0.906
## 7 f_meas_beta2 binary 0.728
# Print AUC values table
print(auc_values)
## # A tibble: 1 × 3
## model roc_auc pr_auc
## <chr> <dbl> <dbl>
## 1 .pred_0 0.966 0.915
# Print contingency table
print(confusion_matrix_results)
## Truth
## Prediction 0 1
## 0 39 5
## 1 17 164
# Print ROC Curve
print(roc_curve_plot)
# Print PR Curve
print(pr_curve_plot)
# ==========================================
# 1️⃣ Generate Predictions for Each Model Member
# ==========================================
# Generate class predictions for each ensemble member from the stacked model
isavuco_class_pred <- isavuco_ml_test %>%
  select(event) %>% # Keep only the target variable (ground truth)
  bind_cols(
    predict(
      isavuco_classif_model_st, # Stacked model
      isavuco_ml_test,
      type = "class", # Predict class labels
      members = TRUE # Include individual model predictions from the stack
    )
  )
# ==========================================
# 2️⃣ Evaluate Individual Model Performance
# ==========================================
# Compute classification accuracy for each ensemble member.
# FIX: use map_dbl() so the result is a plain numeric vector — the original
# map() produced a list column that then needed the deprecated bare `unnest()`.
pred_class_isavuco <- map_dbl(
  colnames(isavuco_class_pred)[-1], # Exclude 'event' column from the loop
  ~ mean(isavuco_class_pred$event == pull(isavuco_class_pred, .x)) # Compare truth vs predictions
) %>%
  set_names(colnames(isavuco_class_pred)[-1]) %>% # Set model names
  enframe(name = "model", value = "accuracy") # Convert named vector to tibble
# ==========================================
# 3️⃣ Display Results in a Table
# ==========================================
# No unnest() needed: `accuracy` is already a numeric column
rmarkdown::paged_table(pred_class_isavuco)
Overall ROC curves for the stacked model and its members
# Generate probability predictions for the stack and each ensemble member
isavuco_class_pred <- isavuco_ml_test %>%
  select(event) %>%
  bind_cols(
    predict(
      isavuco_classif_model_st, # Stacked model
      isavuco_ml_test,
      type = "prob", # Ensure predictions are probabilities
      members = TRUE
    )
  )
# Reshape the data to long format so one ROC curve can be drawn per model
roc_curve_data <- isavuco_class_pred %>%
  pivot_longer(cols = starts_with(".pred_0"), names_to = "model", values_to = "probability") %>%
  mutate(probability = as.numeric(probability)) %>% # Ensure numeric probabilities
  group_by(model) %>%
  roc_curve(event, probability) %>%
  ungroup()
# Plot ROC curves for all models
# FIX: `size` is deprecated for line geoms since ggplot2 3.4.0 — use
# `linewidth`, consistent with the geom_abline() call
roc_curve_plot <- ggplot(roc_curve_data, aes(x = 1 - specificity, y = sensitivity, color = model)) +
  geom_abline(lty = 2, color = "gray80", linewidth = 1) + # Diagonal reference line
  geom_path(linewidth = 1, alpha = 0.8) + # ROC curves
  labs(title = "ROC Curves for Stacked Model and Members",
       x = "1 - Specificity (False Positive Rate)",
       y = "Sensitivity (True Positive Rate)",
       color = "Model") +
  coord_equal() +
  theme_bw()
roc_curve_plot